GAMBLR - working with GAMBL data painlessly in R

If you haven’t already, clone the github repo: git clone git@github.com:morinlab/GAMBLR.git Run this vignette using the base directory of the GAMBLR repository as your working directory. See the Readme in the repository for more tips on getting set up.

# Load the GAMBLR package and other packages required by these examples. 
library(GAMBLR)
require(tidyverse)
require(maftools)
require(circlize)

The key level-1 GAMBL outputs are making their way into a MySQL (a.k.a. MariaDB) database. The most intuitive way to query the data is using the dbplyr R package. The initial connection and a method to determine what each table contains is shown below.

con <- DBI::dbConnect(RMariaDB::MariaDB(), dbname = "gambl_test")
# use DBI function to list the tables
all_table_names = DBI::dbListTables(con)
print(all_table_names)
#> [1] "bedpe_manta_hg19"    "biopsy_metadata"     "maf_slms3_hg19"     
#> [4] "maf_slms3_hg19_icgc" "maf_slms3_hg38"      "sample_metadata"    
#> [7] "seg_battenberg_hg19"
# peek at the contents of each table. 
for(table_name in all_table_names){
  table_db <- tbl(con, table_name)
  print(table_name)
  
}
#> [1] "bedpe_manta_hg19"
#> [1] "biopsy_metadata"
#> [1] "maf_slms3_hg19"
#> [1] "maf_slms3_hg19_icgc"
#> [1] "maf_slms3_hg38"
#> [1] "sample_metadata"
#> [1] "seg_battenberg_hg19"

Get the metadata, automagically joining both tables and dropping normals and RNA-seq samples.


my_metadata = get_gambl_metadata()

# reduce to some of the more useful metadata fields

my_metadata = my_metadata %>% select(sample_id,biopsy_id,myc_ba,cohort,pathology)

print(head(my_metadata))
#>                   sample_id            biopsy_id myc_ba       cohort pathology
#> 1 BLGSP-71-06-00001-01A-11D BLGSP-71-06-00001-01    POS BL_Pediatric        BL
#> 2 BLGSP-71-06-00004-01A-11D BLGSP-71-06-00004-01    POS BL_Pediatric        BL
#> 3 BLGSP-71-06-00013-01B-11D BLGSP-71-06-00013-01    POS BL_Pediatric        BL
#> 4 BLGSP-71-06-00080-01A-11D BLGSP-71-06-00080-01    POS BL_Pediatric        BL
#> 5 BLGSP-71-06-00019-01A-11D BLGSP-71-06-00019-01    POS BL_Pediatric        BL
#> 6 BLGSP-71-06-00008-01A-11D BLGSP-71-06-00008-01    NEG BL_Pediatric        BL

The largest tables in the database are those that contain mutations from MAF files. The tables are simply all the rows and columns from a given merged MAF file. There will likely be four MAF-derived tables in this database, one for each reference build for “GAMBL” and the complete “GAMBL + ICGC”. Although the names may change, currently the tables are implicitly GAMBL-only unless they contain “icgc” in their name. For example, maf_slms3_hg19_icgc contains all hg19 (grch37/hs37d5) mutations from GAMBL and icgc_dart/external cases.

hg19_maf = tbl(con,"maf_slms3_hg19_icgc")
mutation_counts = hg19_maf %>% group_by(Tumor_Sample_Barcode) %>% tally()
print(head(mutation_counts))
#> # Source:   lazy query [?? x 2]
#> # Database: mysql [rmorin@mysql01:NA/gambl_test]
#>   Tumor_Sample_Barcode       n
#>   <chr>                <int64>
#> 1 00-14595_tumorA         7139
#> 2 00-14595_tumorB        18212
#> 3 00-14595_tumorC        27633
#> 4 00-15201_tumorA         6423
#> 5 00-15201_tumorB         8255
#> 6 00-20702T               6335

mutation_counts  %>% ggplot() + geom_histogram(aes(x=n),bins=100) + xlim(c(10,25000))
#> Warning: Removed 142 rows containing non-finite values (stat_bin).
#> Warning: Removed 2 rows containing missing values (geom_bar).

Here’s a histogram of the total number of mutations per genome across all GAMBL cases. Pretty hard to interpret this without metadata. There are two metadata tables. Often we need to join them both to get all the information we want.

sample_meta = tbl(con,"sample_metadata") %>% filter(seq_type == "genome" & tissue_status == "tumour")
#if we only care about genomes, we can drop/filter anything that isn't a tumour genome
#The key for joining this table to the mutation information is to use sample_id. Think of this as equivalent to a library_id. It will differ depending on what assay was done to the sample. 

biopsy_meta = tbl(con,"biopsy_metadata") %>% select(-patient_id) %>% select(-pathology) %>% select(-time_point) %>% select(-EBV_status_inf)
# this table is keyed on biopsy_id. One biopsy can have more than one sample_id. This table must be joined to the sample table using biopsy_id. Because of some redundancy in columns between the two tables, I've dropped all redundant columns with the exception of biopsy_id.

all_meta = left_join(sample_meta,biopsy_meta,by="biopsy_id")
# IMPORTANT: the dbplyr package uses mysql queries under the hood to lazily retrieve the datat you need for each table on-the-fly. To properly use the efficiency of indexing and joins in MySQL, don't convert your tables into data frames until you're done all the necessary joins. 

Obtain all SV breakpoints (called by Manta) within a specific region of the genome and visualize them.


myc_locus_sv = get_manta_sv(region="8:128723128-128774067",pass=FALSE,with_chr_prefix=TRUE) 

# we can override default that requires SV to meet the Manta "Pass" filtering criterion
# here we are also asking for the chromosomes to be named with a chr prefix (for circlize compatability)

bed1 = myc_locus_sv %>% select(CHROM_A,START_A,END_A,tumour_sample_id)
bed2 = myc_locus_sv %>% select(CHROM_B,START_B,END_B,tumour_sample_id)
colnames(bed1)=c("chrom","start","end","sample_id")
colnames(bed2)=c("chrom","start","end","sample_id")

myc_locus_sv = myc_locus_sv %>% select(tumour_sample_id,CHROM_A,START_A,CHROM_B,START_B,STRAND_A,STRAND_B,VAF_tumour)

print(head(myc_locus_sv))
#>   tumour_sample_id CHROM_A   START_A CHROM_B   START_B STRAND_A STRAND_B
#> 1  00-15201_tumorB    chr8 128727469    chr8 128727743        +        +
#> 2  01-12047_tumorC    chr8 128748814   chr14 106994498        -        +
#> 3  01-14774_tumorA    chr8 128727469    chr8 128727743        +        +
#> 4  01-23117_tumorB    chr8 128749370   chr14 106323836        -        +
#> 5  01-23117_tumorB    chr8 128749348   chr14 106323928        +        -
#> 6  02-18356_tumorB    chr8 128727463    chr8 128727752        +        +
#>   VAF_tumour
#> 1      0.151
#> 2      0.429
#> 3      0.163
#> 4      0.345
#> 5      0.154
#> 6      0.169

circos.initializeWithIdeogram()
circos.genomicLink(bed1, bed2,col = rand_color(nrow(bed1),transparency=0.5))

Working with all mutations from a sample.

# we can also query the database to get a MAF per patient for patient-centric visualizations. 
# note that here I'm not restricting to only coding variants
example_dlbcl = hg19_maf %>% filter(Tumor_Sample_Barcode == "HTMCP-01-06-00422-01A-01D")
example_dlbcl_df = example_dlbcl %>% as.data.frame()
example_dlbcl_maf = read.maf(example_dlbcl_df)
#> -Validating
#> -Silent variants: 4552 
#> -Summarizing
#> -Processing clinical data
#> --Missing clinical data
#> -Finished in 0.125s elapsed (0.411s cpu)
rainfallPlot(example_dlbcl_maf)
#> Processing HTMCP-01-06-00422-01A-01D..

You probably will be more interested in visualizing all mutations across a given cohort (or every genome). This can only be done in relatively small regions. Here’s a way to visualize the non-coding mutation pattern across a region of interest.


# set up some coordinates to annotate in your plot (optional)
mybed = data.frame(start=c(128806578,128805652,128748315), end=c(128806992,128809822,128748880), name=c("TSS","enhancer","MYC-e1"))

# get the mutations within a region of interest
my_mutations = get_ssm_by_region(region="chr8:128,743,606-128,820,015")


ashm_rainbow_plot(mutations_maf=my_mutations,metadata=my_metadata,bed=mybed)

Get all CN segments that overlap with a gene of interest and annotate them using the metadata.


my_segments = get_cn_segments(chromosome="4",qstart=83274467,qend=83295149)
print(head(my_segments))
#>             ID chrom    start       end LOH_flag log.ratio
#> 1  1064-01-1TD  chr4    11870 190789536        0         0
#> 2 1182-01-03TD  chr4    69567 190785390        0         1
#> 3 1227-02-02TD  chr4 62724209 190790246        0         0
#> 4 1242-02-01BD  chr4    70761 190790246        0         0
#> 5 1296-01-03TD  chr4    98530 190762427        0         0
#> 6 1318-02-04BD  chr4    81303 190784332        0         0

deleted_segments = my_segments %>% filter(log.ratio<0) #use some lower value if you want to be more stringent

annotated_segments = left_join(deleted_segments,my_metadata,by=c("ID" = "sample_id"))

annotated_segments %>% pull(cohort) %>% table()
#> .
#>   CLL_GenomeCanada        DLBCL_ctDNA     DLBCL_Gascoyne DLBCL_GenomeCanada 
#>                  4                  2                  5                  9 
#>        DLBCL_HTMCP         DLBCL_ICGC  DLBCL_LSARP_Trios        DLBCL_Marra 
#>                  3                  5                 53                 10 
#>    FL_GenomeCanada            FL_ICGC          FL_Kridel    MALY_Other_ICGC 
#>                  1                  4                  6                  3 
#>        MM_mmsanger 
#>                  4

Retrieve all somatic mutations in a given gene and visualize them using MAFtools.


all_ssms = get_ssm_by_gene(gene_symbol=c("CCND3"),coding_only = TRUE)

all_ssms = all_ssms %>% as.data.frame()

# make a MAFtools object and plot a lollipop plot

maf_obj = read.maf(all_ssms)
#> -Validating
#> -Silent variants: 2 
#> -Summarizing
#> -Processing clinical data
#> --Missing clinical data
#> -Finished in 0.070s elapsed (0.409s cpu)
lollipopPlot(maf_obj,gene="CCND3")
#> Assuming protein change information are stored under column HGVSp_Short. Use argument AACol to override if necessary.
#> 4 transcripts available. Use arguments refSeqID or proteinID to manually specify tx name.
#>     HGNC    refseq.ID   protein.ID aa.length
#> 1: CCND3 NM_001136125 NP_001129597       220
#> 2: CCND3    NM_001760    NP_001751       292
#> 3: CCND3 NM_001136017 NP_001129489       211
#> 4: CCND3 NM_001136126 NP_001129598        96
#> Using longer transcript NM_001760 for now.

maf_data = get_coding_ssm(limit_cohort=c("BL_Adult","BL_Pediatric","BL_ICGC"))
maf_metadata = get_gambl_metadata() %>% select(sample_id,cohort,sex)
colnames(maf_metadata)[1] = "Tumor_Sample_Barcode"

maf = read.maf(maf_data,clinicalData = maf_metadata)
#> -Validating
#> --Removed 3 duplicated variants
#> -Silent variants: 10334 
#> -Summarizing
#> -Processing clinical data
#> -Finished in 3.003s elapsed (8.945s cpu)

oncoplot(maf,clinicalFeatures = c("sex","cohort"),sortByAnnotation = TRUE,genesToIgnore = c("TTN"))